
import "dotenv/config";
import { openai } from "@ai-sdk/openai";
import { movieAgent } from "../mastra/agents/movie-agent";
import { createOrGetDataset } from "@arizeai/phoenix-client/datasets";
import type { Example } from "@arizeai/phoenix-client/types/datasets";
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";



// Step 1: define the task to run (we call the agent with the question)
/**
 * Runs the movie agent against a single dataset example.
 *
 * @param example - Dataset example whose `input.question` holds the user question.
 * @returns The text of the agent's response, or an empty string when the
 *   result carries no text.
 * @throws {TypeError} If `example.input.question` is not a string.
 */
export async function task(example: Example): Promise<string> {
    // Narrow at runtime instead of asserting with `as string`, so a malformed
    // example fails loudly here rather than sending a non-string to the agent.
    const question: unknown = example.input.question;
    if (typeof question !== "string") {
        throw new TypeError(
            `Expected example.input.question to be a string, received ${typeof question}`,
        );
    }

    // Call the movie agent with the question
    const result = await movieAgent.generate(question);

    // Extract the text response, defaulting to "" when it is missing
    return result.text ?? "";
}

// Step 2: define the dataset of questions to ask the agent
/**
 * Questions posed to the agent, one per dataset example.
 * Each string becomes the `question` field of an example's input.
 */
const DATASET = [
  "Which horror movie should I watch next?",
  "Give me a good comedy movie to watch tonight.",
  "Recommend a comedy that is also a musical",
  "Show me a popular movie that didn’t do well at the box office",
  "What horror movies are not too violent",
  "Name a feel-good holiday movie",
  "Recommend a musical with great songs",
  "Give me a classic drama from the 90s",
  "Name a movie that is a classic action movie",
  "Which Batman movie should I watch?",
];

/**
 * Dataset named "movie-rec-questions", built from `DATASET` with one example
 * per question (`{ input: { question } }`).
 *
 * Resolved with top-level await, so importing this module performs the
 * `createOrGetDataset` call before any dependent code runs.
 */
export const dataset = await createOrGetDataset({
  name: "movie-rec-questions",
  description: "Questions to ask a movie recommendation agent",
  examples: DATASET.map((question) => ({ input: { question } })),
});

// Step 3: Define the evaluators
/**
 * Prompt template passed as `promptTemplate` to the relevance evaluator.
 *
 * Contains two placeholders, {{input.question}} and {{output}}. Presumably
 * these are filled from the example's input and the task's output when the
 * evaluator runs; confirm against the phoenix-evals templating rules.
 *
 * The prompt asks for a one-word answer, `correct` or `incorrect`, which are
 * the same labels used as the keys of `choices` in
 * `recommendationRelevanceEvaluator`.
 *
 * NOTE: every character of this literal, including leading indentation and
 * blank lines, is part of the prompt sent to the model.
 */
const RECOMMENDATION_RELEVANCE = `
  You are evaluating the relevance of movie recommendations provided by an LLM application.

  You will be given:
  1. The user input that initiated the trace
  2. The list of movie recommendations output by the system

  ##
  User Input:
  {{input.question}}

  Recommendations:
  {{output}}
  ##

  Respond with exactly one word: \`correct\` or \`incorrect\`.
  1. \`correct\` →
  - All recommended movies match the requested genre or criteria in the user input.
  - The recommendations should be relevant to the user's request and shouldn't be repetitive.
  2.\`incorrect\` → one or more recommendations do not match the requested genre or criteria.
  `;

  
  /**
   * Classification evaluator named "Relevance".
   *
   * Uses the OpenAI "gpt-5" model with the `RECOMMENDATION_RELEVANCE` prompt
   * and maps the label `correct` to 1 and `incorrect` to 0.
   */
  export const recommendationRelevanceEvaluator = createClassificationEvaluator({
    name: "Relevance",
    model: openai("gpt-5"),
    promptTemplate: RECOMMENDATION_RELEVANCE,
    choices: { correct: 1, incorrect: 0 },
  });
    

